# This script prepares the different complete data, oversampled and/or undersampled training datasets to develop the preschool CAPP models. 
# These preschool training datasets prepared in this script will have had the following optimisation techniques applied: ADASYN oversampling and/or random undersampling to give 1:1 class balance
# Once the data is prepared, this script needs to be immediately followed by: "Model_development_XXX.txt", where XXX is the name of the different algorithms considered. 
# The data in file "Preschool_standardised_initial_training_dataset_365IDs.csv" is found in IOWBC_training_test_data.xlsx, sheet: "Standardised preschool training"
# The data in files named "Oversampled_preschool_dataset_XXX.csv" were developed using the script "Data_preparation_CAPP_oversampling.txt" (data can be found in XXX).
# Python version 3.6.8 was used 

# Imports (these must run before the os.chdir call below; the original
# script called os.chdir before "import os", which raises a NameError)
import os
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# Set working directory
# NOTE(review): "/../../" resolves to the filesystem root "/" - confirm the
# intended data directory and replace with an explicit absolute path.
os.chdir("/../../")

#######################
### Import datasets ###
#######################
# Construct both oversampled and undersampled datasets in the following way:
# data_0 = complete case		data_0_U = complete case, undersampled
# data_25_O = 25% oversampled cases		data_25_OU = 25% oversampled cases, undersampled controls to 1:1 class ratio


def undersample_controls(df, random_state=123):
    """Randomly undersample controls to give a 1:1 class balance.

    Keeps every case (Asthma_10YR == 1) plus an equal-sized random sample of
    controls (Asthma_10YR == 0), then shuffles the combined rows. The fixed
    random_state makes the selection reproducible across runs.
    """
    cases = df.loc[df['Asthma_10YR'] == 1]
    controls = shuffle(df.loc[df['Asthma_10YR'] == 0], random_state=random_state)
    # Retain as many controls as there are cases. The original script
    # hard-coded each count (51, 64, 77, ...); len(cases) generalises it.
    controls = controls.iloc[:len(cases), :]
    # pd.concat replaces DataFrame.append, which is deprecated and removed
    # in pandas >= 2.0; with ignore_index=True the result is identical.
    balanced = pd.concat([cases, controls], ignore_index=True)
    return shuffle(balanced, random_state=random_state)


def load_oversampled(pct, n_rows):
    """Read one ADASYN-oversampled training CSV, keeping its first n_rows rows.

    NOTE(review): the n_rows cut trims trailing rows from each oversampled
    file - presumably synthetic rows beyond the intended oversampling level;
    confirm against the "Data_preparation_CAPP_oversampling" script.
    """
    df = pd.read_csv("Oversampled_preschool_dataset_%d%%.csv" % pct, index_col=False)
    return df.iloc[0:n_rows, :]


# Complete-case training data
data_0 = pd.read_csv("Preschool_standardised_initial_training_dataset_365IDs.csv", index_col=False)
print('Original dataset shape %s' % Counter(data_0.Asthma_10YR))
# Original dataset shape Counter({0: 314, 1: 51})

# Undersample the controls
data_0_U = undersample_controls(data_0)
print('Undersampled dataset shape %s' % Counter(data_0_U.Asthma_10YR))
# Undersampled dataset shape Counter({0: 51, 1: 51})


data_25_O = load_oversampled(25, 378)
print('Original dataset shape %s' % Counter(data_25_O.Asthma_10YR))
# Original dataset shape Counter({0: 314, 1: 64})

# Undersample the controls
data_25_OU = undersample_controls(data_25_O)
print('Undersampled dataset shape %s' % Counter(data_25_OU.Asthma_10YR))
# Undersampled dataset shape Counter({0: 64, 1: 64})


data_50_O = load_oversampled(50, 391)
print('Original dataset shape %s' % Counter(data_50_O.Asthma_10YR))
# Original dataset shape Counter({0: 314, 1: 77})

# Undersample the controls
data_50_OU = undersample_controls(data_50_O)
print('Undersampled dataset shape %s' % Counter(data_50_OU.Asthma_10YR))
# Undersampled dataset shape Counter({0: 77, 1: 77})


data_100_O = load_oversampled(100, 416)
print('Original dataset shape %s' % Counter(data_100_O.Asthma_10YR))
# Original dataset shape Counter({0: 314, 1: 102})

# Undersample the controls
data_100_OU = undersample_controls(data_100_O)
print('Undersampled dataset shape %s' % Counter(data_100_OU.Asthma_10YR))
# Undersampled dataset shape Counter({0: 102, 1: 102})
# (the original script's comment here claimed {0: 206, 1: 206}, which
# contradicts the 102-case count above - 102 is what the code produces)


data_150_O = load_oversampled(150, 442)
print('Original dataset shape %s' % Counter(data_150_O.Asthma_10YR))
# Original dataset shape Counter({0: 314, 1: 128})

# Undersample the controls
data_150_OU = undersample_controls(data_150_O)
print('Undersampled dataset shape %s' % Counter(data_150_OU.Asthma_10YR))
# Undersampled dataset shape Counter({0: 128, 1: 128})


data_200_O = load_oversampled(200, 467)
print('Original dataset shape %s' % Counter(data_200_O.Asthma_10YR))
# Original dataset shape Counter({0: 314, 1: 153})

# Undersample the controls
data_200_OU = undersample_controls(data_200_O)
print('Undersampled dataset shape %s' % Counter(data_200_OU.Asthma_10YR))
# Undersampled dataset shape Counter({0: 153, 1: 153})


data_250_O = load_oversampled(250, 493)
print('Original dataset shape %s' % Counter(data_250_O.Asthma_10YR))
# Original dataset shape Counter({0: 314, 1: 179})

# Undersample the controls
data_250_OU = undersample_controls(data_250_O)
print('Undersampled dataset shape %s' % Counter(data_250_OU.Asthma_10YR))
# Undersampled dataset shape Counter({0: 179, 1: 179})


data_300_O = load_oversampled(300, 518)
print('Original dataset shape %s' % Counter(data_300_O.Asthma_10YR))
# Original dataset shape Counter({0: 314, 1: 204})

# Undersample the controls
data_300_OU = undersample_controls(data_300_O)
print('Undersampled dataset shape %s' % Counter(data_300_OU.Asthma_10YR))
# Undersampled dataset shape Counter({0: 204, 1: 204})

# Assign all training datasets to be considered for model development into data object
data = [
    data_0, data_25_O, data_50_O, data_100_O,
    data_150_O, data_200_O, data_250_O, data_300_O,
    data_0_U, data_25_OU, data_50_OU, data_100_OU,
    data_150_OU, data_200_OU, data_250_OU, data_300_OU,
]

# Set should be indexed according to the number of datasets included in the object data. This will be used during model development to loop through each training dataset.
# Deriving it from len(data) keeps it in sync if datasets are added or removed.
# NOTE(review): "set" shadows the Python builtin; the name is kept because the
# model-development scripts reference it - consider renaming across all scripts.
set = list(range(len(data)))

# Import preschool test data, standardised against the initial preschool training dataset - data found in IOWBC_training_test_data.xlsx, sheet: "Standardised preschool test set"
test = pd.read_csv("Preschool_standardised_test_dataset_183IDs.csv", index_col=False)

# Separate the outcome column from the feature matrix (Study_ID is an
# identifier, not a predictor, so it is dropped from the features as well)
y_test = test['Asthma_10YR']
X_test = test.drop(['Study_ID', 'Asthma_10YR'], axis=1)

